Exploratory Data Analysis on Well Anderson_East_2-35¶
Importing Necessary Libraries¶
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
from scipy.stats import zscore
import plotly.io as pio
from IPython.display import Image
Loading the Dataset¶
In [2]:
# load dataset from csv file
df = pd.read_csv('./../wells/Anderson_East_2-35 Run Data.csv')
Analyzing the Dataset¶
In [3]:
df
Out[3]:
| Run Number | Well | Time - DROP | CP - DROP (psi) | TP - DROP (psi) | LP - DROP (psi) | Liquid Load - DROP (psi) | Fluid in Tubing - DROP (bbl) | Time - ARRIVING | CP - ARRIVING (psi) | ... | Lift Pressure - ARRIVING (psi) | Lift Factor - ARRIVING | Rise Time (m) | Rise Velocity (fpm) | Good | Miss | Open Duration (m) | Close Duration (m) | Gas | Liquid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1669 | Anderson_East_2-35 | 2/5/2025 23:08 | 271 | 62 | 50.76 | 209 | 2.67 | 2/5/2025 22:54 | 275 | ... | 228.44 | 1.43 | 9.066667 | 497.316176 | 1 | 0 | 13.550000 | 0.850000 | 0.14 | NaN |
| 1 | 1668 | Anderson_East_2-35 | 2/5/2025 22:53 | 273 | 61 | 50.05 | 212 | 2.70 | 2/5/2025 22:40 | 276 | ... | 223.93 | 1.40 | 8.250000 | 546.545455 | 1 | 0 | 13.750000 | 0.850000 | 0.12 | NaN |
| 2 | 1667 | Anderson_East_2-35 | 2/5/2025 22:39 | 275 | 65 | 54.55 | 210 | 2.68 | 2/5/2025 22:25 | 276 | ... | 222.89 | 1.40 | 8.100000 | 556.666667 | 1 | 0 | 13.716667 | 0.800000 | 0.14 | NaN |
| 3 | 1666 | Anderson_East_2-35 | 2/5/2025 22:24 | 275 | 67 | 56.24 | 208 | 2.65 | 2/5/2025 22:11 | 274 | ... | 220.80 | 1.38 | 8.133333 | 554.385246 | 1 | 0 | 13.300000 | 0.850000 | 0.13 | NaN |
| 4 | 1665 | Anderson_East_2-35 | 2/5/2025 22:10 | 273 | 67 | 56.25 | 206 | 2.63 | 2/5/2025 21:57 | 277 | ... | 224.33 | 1.34 | 7.966667 | 565.983264 | 1 | 0 | 13.233333 | 0.766667 | 0.13 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1664 | 5 | Anderson_East_2-35 | 1/10/2025 15:42 | 293 | 222 | 213.66 | 71 | 0.91 | 1/10/2025 15:38 | 300 | ... | 85.70 | 1.45 | 0.266667 | 16908.750000 | 1 | 0 | 4.166667 | 0.883333 | 0.00 | NaN |
| 1665 | 4 | Anderson_East_2-35 | 1/10/2025 15:37 | 299 | 223 | 215.22 | 76 | 0.97 | 1/10/2025 15:27 | 296 | ... | 79.75 | 1.56 | 9.033333 | 499.151292 | 1 | 0 | 10.083333 | 0.783333 | 0.00 | NaN |
| 1666 | 3 | Anderson_East_2-35 | 1/10/2025 15:26 | 296 | 226 | 217.86 | 70 | 0.89 | 1/10/2025 15:25 | 296 | ... | 79.30 | 1.50 | 0.266667 | 16908.750000 | 1 | 0 | 1.666667 | 0.783333 | 0.00 | NaN |
| 1667 | 2 | Anderson_East_2-35 | 1/10/2025 15:24 | 294 | 227 | 219.75 | 67 | 0.85 | 1/10/2025 15:14 | 297 | ... | 76.20 | 1.47 | 8.466667 | 532.559055 | 1 | 0 | 9.416667 | 0.883333 | 0.00 | NaN |
| 1668 | 1 | Anderson_East_2-35 | 1/10/2025 15:13 | 295 | 230 | 222.06 | 65 | 0.83 | 1/10/2025 15:11 | 294 | ... | 73.71 | 1.60 | 0.266667 | 16908.750000 | 1 | 0 | 2.200000 | 0.783333 | 0.00 | NaN |
1669 rows × 22 columns
In [4]:
df.describe()
Out[4]:
| Run Number | CP - DROP (psi) | TP - DROP (psi) | LP - DROP (psi) | Liquid Load - DROP (psi) | Fluid in Tubing - DROP (bbl) | CP - ARRIVING (psi) | TP - ARRIVING (psi) | LP - ARRIVING (psi) | Lift Pressure - ARRIVING (psi) | Lift Factor - ARRIVING | Rise Time (m) | Rise Velocity (fpm) | Good | Miss | Open Duration (m) | Close Duration (m) | Gas | Liquid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 1669.000000 | 0.0 |
| mean | 835.000000 | 287.444578 | 94.483523 | 83.541216 | 192.961055 | 2.461744 | 288.486519 | 141.820851 | 80.665392 | 207.821132 | 1.818922 | 9.285191 | 1286.957572 | 0.944877 | 0.055123 | 19.229239 | 3.489305 | 0.105177 | NaN |
| std | 481.943116 | 22.467511 | 45.289667 | 46.907007 | 47.048463 | 0.600342 | 21.934185 | 47.159301 | 46.853570 | 48.023197 | 3.312858 | 8.366285 | 3407.361456 | 0.228288 | 0.228288 | 119.857887 | 66.099724 | 0.719213 | NaN |
| min | 1.000000 | 264.000000 | 7.000000 | 0.020000 | 44.000000 | 0.560000 | 265.000000 | 7.000000 | 0.020000 | 55.190000 | 0.880000 | 0.166667 | 64.862143 | 0.000000 | 0.000000 | 0.983333 | 0.750000 | -29.190000 | NaN |
| 25% | 418.000000 | 277.000000 | 70.000000 | 57.200000 | 197.000000 | 2.510000 | 279.000000 | 116.000000 | 54.280000 | 211.930000 | 1.340000 | 6.666667 | 518.275862 | 1.000000 | 0.000000 | 11.916667 | 0.800000 | 0.120000 | NaN |
| 50% | 835.000000 | 281.000000 | 75.000000 | 63.740000 | 206.000000 | 2.630000 | 282.000000 | 124.000000 | 60.930000 | 222.340000 | 1.390000 | 7.866667 | 573.177966 | 1.000000 | 0.000000 | 12.966667 | 0.850000 | 0.130000 | NaN |
| 75% | 1252.000000 | 287.000000 | 87.000000 | 76.230000 | 213.000000 | 2.720000 | 288.000000 | 139.000000 | 73.960000 | 228.910000 | 1.470000 | 8.700000 | 676.350000 | 1.000000 | 0.000000 | 14.283333 | 0.866667 | 0.150000 | NaN |
| max | 1669.000000 | 466.000000 | 280.000000 | 272.890000 | 459.000000 | 5.860000 | 467.000000 | 340.000000 | 273.210000 | 427.810000 | 54.110000 | 69.516667 | 27054.000000 | 1.000000 | 1.000000 | 4805.400000 | 2695.433333 | 0.970000 | NaN |
Preprocessing the Dataset¶
Shifting the Dataset¶
In [5]:
df_z = pd.read_csv('./../wells/Anderson_East_2-35 Run Data.csv')
# shift CP-DROP, TP-DROP, LP-DROP by -1 to match the data
df_z['CP - DROP (psi)'] = df_z['CP - DROP (psi)'].shift(-1)
df_z['TP - DROP (psi)'] = df_z['TP - DROP (psi)'].shift(-1)
df_z['LP - DROP (psi)'] = df_z['LP - DROP (psi)'].shift(-1)
df_z['Liquid Load - DROP (psi)'] = df_z['Liquid Load - DROP (psi)'].shift(-1)
df_z['Fluid in Tubing - DROP (bbl)'] = df_z['Fluid in Tubing - DROP (bbl)'].shift(-1)
# drop column liquid load
df_z = df_z.drop(columns=['Liquid'])
df_z = df_z.dropna()
Removing Outliers from Dataset¶
In [6]:
all_columns = df_z.columns
# features to retain
retain_columns = ['Time - DROP', 'Time - ARRIVING', 'Run Number', "Well"]
# Identify numerical features (excluding retained ones)
numeric_columns = df_z.select_dtypes(include=[np.number]).columns.tolist()
numeric_columns = [col for col in numeric_columns if col not in retain_columns]
# Compute Z-scores for numerical columns
z_scores = df_z[numeric_columns].apply(zscore)
# Define outlier mask (threshold |Z| > 3)
outlier_mask = (z_scores.abs() > 3)
outlier_rows = outlier_mask.any(axis=1)
# mask for gas production being zero
gas_zero_mask = (df_z['Gas'] == 0)
# Replace outlier rows' non-retained columns with NaN
df_z.loc[outlier_rows, numeric_columns] = np.nan
# Replacing all features except 'Time - Drop' and 'Time - Arriving' with NaN when 'Gas' is zero
df_z.loc[gas_zero_mask, [col for col in all_columns if col not in retain_columns]] = np.nan
# Save the cleaned dataset
df_z.to_csv('cleaned_well_data_Z.csv', index=False)
In [7]:
df_z.describe()
Out[7]:
| Run Number | CP - DROP (psi) | TP - DROP (psi) | LP - DROP (psi) | Liquid Load - DROP (psi) | Fluid in Tubing - DROP (bbl) | CP - ARRIVING (psi) | TP - ARRIVING (psi) | LP - ARRIVING (psi) | Lift Pressure - ARRIVING (psi) | Lift Factor - ARRIVING | Rise Time (m) | Rise Velocity (fpm) | Good | Miss | Open Duration (m) | Close Duration (m) | Gas | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1668.00000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.000000 | 1451.0 | 1451.0 | 1451.000000 | 1451.000000 | 1451.000000 |
| mean | 835.50000 | 281.390076 | 83.884218 | 72.384163 | 197.505858 | 2.519759 | 282.556168 | 129.773260 | 69.627567 | 212.928608 | 1.403653 | 7.718148 | 666.802582 | 1.0 | 0.0 | 15.121089 | 0.834229 | 0.135899 |
| std | 481.65444 | 6.493350 | 30.245120 | 31.674049 | 31.459538 | 0.401364 | 6.530996 | 25.110767 | 31.813109 | 33.139636 | 0.098583 | 2.098859 | 570.025033 | 0.0 | 0.0 | 12.969100 | 0.035514 | 0.020942 |
| min | 2.00000 | 264.000000 | 58.000000 | 45.590000 | 55.000000 | 0.700000 | 265.000000 | 94.000000 | 41.910000 | 67.550000 | 1.190000 | 0.433333 | 182.059219 | 1.0 | 0.0 | 1.566667 | 0.750000 | 0.020000 |
| 25% | 418.75000 | 277.000000 | 69.000000 | 56.615000 | 200.000000 | 2.550000 | 278.000000 | 115.000000 | 53.785000 | 215.530000 | 1.340000 | 6.800000 | 532.559055 | 1.0 | 0.0 | 12.100000 | 0.800000 | 0.120000 |
| 50% | 835.50000 | 280.000000 | 74.000000 | 62.540000 | 207.000000 | 2.640000 | 281.000000 | 123.000000 | 59.710000 | 222.820000 | 1.390000 | 7.850000 | 574.394905 | 1.0 | 0.0 | 12.950000 | 0.850000 | 0.140000 |
| 75% | 1252.25000 | 285.000000 | 80.000000 | 68.465000 | 212.000000 | 2.700000 | 286.000000 | 133.000000 | 65.830000 | 228.590000 | 1.450000 | 8.466667 | 663.088235 | 1.0 | 0.0 | 14.016667 | 0.850000 | 0.150000 |
| max | 1669.00000 | 307.000000 | 223.000000 | 215.370000 | 238.000000 | 3.040000 | 308.000000 | 243.000000 | 215.450000 | 254.830000 | 2.270000 | 24.766667 | 10405.384620 | 1.0 | 0.0 | 177.933333 | 0.916667 | 0.200000 |
Feature Engineering¶
In [8]:
# Deriving Gas Rate
df_z['GasRate'] = df_z['Gas'] / df_z['Open Duration (m)']
# Deriving Fluid Rate
df_z['Fluidrate (bbl/m)'] = df_z['Fluid in Tubing - DROP (bbl)'] / df_z['Open Duration (m)']
# df_z = df_z.dropna()
# Deriving change in Lift Pressure Rate
df_z['Lift Pressure - DROP (psi)'] = df_z['CP - DROP (psi)'] - df_z['LP - DROP (psi)']
df_z['UpwardRate (psi/m)'] = (df_z['Lift Pressure - ARRIVING (psi)'] - df_z['Lift Pressure - DROP (psi)'])/ df_z['Close Duration (m)']
df_z.to_csv('cleaned_well_data_Z.csv', index=False)
Visualizing the Trends with Derived Features¶
In [9]:
# Create Plotly figure
fig = go.Figure()
# Add scatter plot (point cloud)
fig.add_trace(go.Scatter(
x=df_z['TP - DROP (psi)'],
y=df_z['GasRate'],
mode='markers',
marker=dict(size=6, color='green', opacity=1),
name='Gas Rate'
))
# Customize layout
fig.update_layout(
title='Gas Rate vs Tubing Pressure',
xaxis_title='TP - DROP (psi)',
yaxis_title='Gas Rate',
# xaxis_range=[60, 4000], # Set x-axis limits (adjust as needed)
template='plotly_white'
)
# Save the figure as a static image
fig.write_image('gas_rate_vs_tubing_pressure.png')
# Display the static image in the notebook
display(Image('gas_rate_vs_tubing_pressure.png'))
In [10]:
# Create Plotly figure
fig = go.Figure()
# Add scatter plot (point cloud)
fig.add_trace(go.Scatter(
x=df_z['Rise Time (m)'],
y=df_z['GasRate'],
mode='markers',
marker=dict(size=6, color='blue', opacity=1),
name='Gas Rate'
))
# Customize layout
fig.update_layout(
title='Gas Rate vs Rise Time',
xaxis_title='Rise Time (m)',
yaxis_title='Gas Rate',
# xaxis_range=[60, 4000], # Set x-axis limits (adjust as needed)
template='plotly_white'
)
# Show plot
fig.show()
In [11]:
# Create Plotly figure
fig = go.Figure()
# Add scatter plot (point cloud)
fig.add_trace(go.Scatter(
x=df_z['Rise Velocity (fpm)'],
y=df_z['GasRate'],
mode='markers',
marker=dict(size=6, color='red', opacity=1),
name='Gas Rate'
))
# Customize layout
fig.update_layout(
title='Gas Rate vs Rise Velocity',
xaxis_title='Rise Velocity (fpm)',
yaxis_title='Gas Rate',
xaxis_range=[60, 4000], # Set x-axis limits (adjust as needed)
template='plotly_white'
)
# Show plot
fig.show()
In [12]:
# Create Plotly figure
fig = go.Figure()
# Add scatter plot (point cloud)
fig.add_trace(go.Scatter(
x=df_z['Open Duration (m)'],
y=df_z['GasRate'],
mode='markers',
marker=dict(size=6, color='green', opacity=1),
name='Gas Rate'
))
# Customize layout
fig.update_layout(
title='Gas Rate vs Open Duration',
xaxis_title='Open Duration (m)',
yaxis_title='Gas Rate',
xaxis_range=[0, 80], # Set x-axis limits (adjust as needed)
template='plotly_white'
)
# Show plot
fig.show()
In [13]:
# Create Plotly figure
fig = go.Figure()
# Add scatter plot
fig.add_trace(go.Scatter(
x=df_z['Rise Velocity (fpm)'],
y=df_z['Rise Time (m)'],
mode='markers',
marker=dict(size=6, color='red', opacity=0.7),
name='Rise Time vs Velocity'
))
# Customize layout
fig.update_layout(
title='Rise Time vs Rise Velocity',
xaxis_title='Rise Velocity (fpm)',
yaxis_title='Rise Time (m)',
template='plotly_white'
)
# Show plot
fig.show()
In [14]:
# Create Plotly figure
fig = go.Figure()
# Add scatter plot (point cloud)
fig.add_trace(go.Scatter(
x=df_z['Liquid Load - DROP (psi)'],
y=df_z['GasRate'],
mode='markers',
marker=dict(size=6, color='blue', opacity=1),
name='Gas Rate'
))
# Customize layout
fig.update_layout(
title='Liquid Load vs Gas Rate',
xaxis_title='Liquid Load - DROP (psi)',
yaxis_title='Gas Rate',
# xaxis_range=[0, 50], # Set x-axis limits (adjust as needed)
template='plotly_white'
)
# Show plot
fig.show()
In [15]:
# drop column form df
df_z = df_z.drop(columns=['Good', 'Miss'])
plt.figure(figsize=(10, 6))
sns.heatmap(df_z.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.show()